This notebook inspects, loads and modifies some of the code from the ENTS paper's tar file. The aim is to produce a pickled object that returns features for given protein pairs, in the same way as was done for the Gene Ontology features.
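For reference, a minimal sketch of the intended usage, assuming the human.ENTS.features.pickle file name and the frozenset indexing used at the end of this notebook; the Entrez IDs shown are placeholders, not real data:
import pickle

# Load the pickled feature object (file name as used at the end of this notebook)
f = open("human.ENTS.features.pickle")
entsfeatures = pickle.load(f)
f.close()

# Index with a frozenset of two Entrez gene IDs to get a list of feature values
pair = frozenset(["12345", "67890"])  # placeholder IDs
try:
    print entsfeatures[pair]
except KeyError:
    print "no ENTS features available for this pair"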
The data is loaded by the code in the file run_genome_prob.py.
The code to do this is relatively short and has been loaded into the cell below.
In [1]:
cd ../../ents/standalone/
In [2]:
import pred_file_maker
from multiloc2 import MultiLoc2_highres
from DataFrame import DataFrame
from R_utilities import R_utilities as R
from optparse import OptionParser
from multiprocessing import Process
import os
import numpy as np
In [3]:
def get_n_unique_prots(subcell_file, prot_dict_file_name):
    subcell_info = MultiLoc2_highres(subcell_file)
    subcell_info = dict((k.split('.')[0],v) for k,v in subcell_info.gene_dict.iteritems())
    prot_dict_file = open(prot_dict_file_name)
    count = 0
    for line in prot_dict_file:
        line = line.strip().split('\t')
        if line[1] not in subcell_info: continue
        else: count += 1
    return count
In [4]:
# define file names
subcell_file = "h_sapiens_subcellular_multiloc2.out"
odds_file_name = "domain_odds.in"
domine_file_name = "domine_flat_file.in"
prot_dict_file_name = "H_sapiens_domains.out"
In [5]:
### Open the files ###
print "Reading input files..."
subcell_info = MultiLoc2_highres(subcell_file)
org_type = subcell_info.org_type
# Create dictionary of gene_name -> info for the subcellular localization
subcell_info = dict((k.split('.')[0],v) for k,v in subcell_info.gene_dict.iteritems())
odds_file = open(odds_file_name)
# Create dictionary of domain_pair -> odds_of_interaction_association
odds_dict = {}
for line in odds_file:
    line = line.strip().split(':')
    odds = float(line[1])
    pair = line[0].split(',')
    odds_dict[tuple(sorted(pair))] = odds
odds_file.close()
print "Done."
### Get essential protein information ###
print "Getting protein information..."
# Get all organism genes
proteins = []
# Dictionary from gene_stable_id -> protein_stable_id and the other way around
gene_protein_dict = {}
protein_gene_dict = {}
protein_domain_dict = {}
prot_dict_file = open(prot_dict_file_name)
for line in prot_dict_file:
    line = line.strip().split('\t')
    if line[1] not in subcell_info: continue
    proteins.append(line[0])
    gene_protein_dict[line[1]] = line[0]
    protein_gene_dict[line[0]] = line[1]
    if len(line) == 3:
        protein_domain_dict[line[0]] = line[2].split(' ')
    else: protein_domain_dict[line[0]] = []
prot_dict_file.close()
# Get domine interactions as dictionary of domine_pair -> confidence
domine_interactions = {}
domine = open(domine_file_name)
for line in domine:
    line = line.strip().split('\t')
    domine_interactions[tuple(sorted(line[0].split(',')))] = line[1]
print "Done."
The original code uses these parsed data files to write a file which is then passed as input to R through Rserve. Looking at this code, we can use the same data to create feature vectors for arbitrary protein pairs. The code that writes to the file is as follows:
for line in pair_file:
    line = line.strip().split('\t')
    # Get the domain part of the string
    protein1 = line[0]
    protein2 = line[1]
    if protein1 not in pair_prots: pair_prots.append(protein1)
    if protein2 not in pair_prots: pair_prots.append(protein2)
    try:
        domain_string = makeDomainString(protein_domain_dict[protein1], protein_domain_dict[protein2],\
                                         domine_interactions, odds_dict)
    except:
        key1 = [x for x in protein_domain_dict.keys() if protein1.startswith(x.split('.')[0])]
        key2 = [x for x in protein_domain_dict.keys() if protein2.startswith(x.split('.')[0])]
        if len(key1) == 0 or len(key2) == 0: continue
        else:
            domain_string = makeDomainString(protein_domain_dict[key1[0]], protein_domain_dict[key2[0]],\
                                             domine_interactions, odds_dict)
    # Get the subcellular localization part of the string
    try: subcell_string = makeSubcellularDict(protein1, protein2, subcell_info, protein_gene_dict)
    except KeyError:
        key1 = [x for x in protein_domain_dict.keys() if protein1.startswith(x.split('.')[0])]
        key2 = [x for x in protein_domain_dict.keys() if protein2.startswith(x.split('.')[0])]
        if len(key1) == 0 or len(key2) == 0: continue
        else:
            try: subcell_string = makeSubcellularDict(key1[0], key2[0], subcell_info, protein_gene_dict)
            except: continue
    out_file.write("%s\t%s\t%s\t%s\n" % (protein1,protein2,domain_string,subcell_string))
Taking this out of the loop, removing the file references and passing the protein names in to a function should make this work.
First we have to define a couple of the functions used above:
In [6]:
def makeDomainString(domains1, domains2, domine_dict, odds_dict):
    domain_dict = {}
    # Get all potential interactions
    if len(domains1) > 0 and len(domains2) > 0:
        potential_domain_pairs = getTwoListCombos(domains1,domains2)
    else: potential_domain_pairs = []
    # Get the domine information
    domine_pairs = list(set([tuple(sorted(x)) for x in potential_domain_pairs if tuple(sorted(x)) in domine_dict]))
    domain_dict['n_domine_pairs'] = len(domine_pairs)
    domain_dict['highest_domine_conf'] = '0'
    for pair in domine_pairs:
        if domine_dict[pair] == 'HC': domain_dict['highest_domine_conf'] = 'HC'
        elif domine_dict[pair] == 'MC' and domain_dict['highest_domine_conf'] != 'HC':
            domain_dict['highest_domine_conf'] = 'MC'
        elif domine_dict[pair] == 'LC' and domain_dict['highest_domine_conf'] not in ['HC','MC']:
            domain_dict['highest_domine_conf'] = 'LC'
    # Get the odds information
    ############## TEST ##################
    domain_dict['lowest_odds'] = 0.
    domain_dict['not_observed'] = 0
    domain_dict['not_observed_frac'] = 1.
    ################################
    domain_dict['sum_odds'] = 0.
    domain_dict['highest_odds'] = 0.
    domain_dict['n_odds_pairs'] = 0
    for pair in potential_domain_pairs:
        pair = tuple(list(sorted(pair)))
        # Check if in the odds dictionary. If so, update domain variables
        if pair in odds_dict:
            domain_dict['sum_odds'] += odds_dict[pair]
            if odds_dict[pair] > domain_dict['highest_odds']:
                domain_dict['highest_odds'] = odds_dict[pair]
            domain_dict['n_odds_pairs'] += 1
            ################### TEST ################
            if odds_dict[pair] < domain_dict['lowest_odds']: domain_dict['lowest_odds'] = odds_dict[pair]
        else:
            domain_dict['not_observed'] += 1
    if domain_dict['n_odds_pairs'] + domain_dict['not_observed'] > 0:
        domain_dict['not_observed_frac'] = float(domain_dict['not_observed']) / (domain_dict['n_odds_pairs'] + domain_dict['not_observed'])
    #########################################
    domain_dict = [str(domain_dict[k]) for k in domain_cols]
    return "\t".join(domain_dict)
In [7]:
def makeSubcellularDict(protein1, protein2, subcell_dict, protein_gene_dict = None):
    gene1 = protein_gene_dict[protein1]
    gene2 = protein_gene_dict[protein2]
    svm_line1 = [subcell_dict[gene1]['predictions'][k] for k in svm_subcell_cols]
    svm_line2 = [subcell_dict[gene2]['predictions'][k] for k in svm_subcell_cols]
    svm_line1 += [subcell_dict[gene1]['svm_info'][k] for k in svm_detail_cols]
    svm_line2 += [subcell_dict[gene2]['svm_info'][k] for k in svm_detail_cols]
    svm_line = svm_line1 + svm_line2
    return "\t".join([str(x) for x in svm_line])
In [8]:
def getfeaturevector(protein1, protein2, subcell_info, odds_dict, proteins, domine_interactions, \
                     protein_domain_dict, protein_gene_dict = None, verbose = True):
    """prototype function to retrieve ENTS feature vectors for a given protein pair"""
    keys = subcell_info.keys()
    # Configure svm columns for organism
    gene = keys[0]
    delete_svm_pred_cols = [x for x in svm_subcell_cols if x not in subcell_info[gene]['predictions'].keys()]
    delete_svm_detail_cols = [x for x in svm_detail_cols if x not in subcell_info[gene]['svm_info'].keys()]
    for col in delete_svm_pred_cols: svm_subcell_cols.remove(col)
    for col in delete_svm_detail_cols: svm_detail_cols.remove(col)
    # Get the domain part of the string
    try:
        domain_string = makeDomainString(protein_domain_dict[protein1], protein_domain_dict[protein2],\
                                         domine_interactions, odds_dict)
    except:
        key1 = [x for x in protein_domain_dict.keys() if protein1.startswith(x.split('.')[0])]
        key2 = [x for x in protein_domain_dict.keys() if protein2.startswith(x.split('.')[0])]
        if len(key1) == 0 or len(key2) == 0:
            return None
        else:
            domain_string = makeDomainString(protein_domain_dict[key1[0]], protein_domain_dict[key2[0]],\
                                             domine_interactions, odds_dict)
    # Get the subcellular localization part of the string
    try: subcell_string = makeSubcellularDict(protein1, protein2, subcell_info, protein_gene_dict)
    except KeyError:
        key1 = [x for x in protein_domain_dict.keys() if protein1.startswith(x.split('.')[0])]
        key2 = [x for x in protein_domain_dict.keys() if protein2.startswith(x.split('.')[0])]
        if len(key1) == 0 or len(key2) == 0:
            return None
        else:
            subcell_string = makeSubcellularDict(key1[0], key2[0], subcell_info, protein_gene_dict)
            #try: subcell_string = makeSubcellularDict(key1[0], key2[0], subcell_info, protein_gene_dict)
            #except:
            #    return None
    return domain_string, subcell_string
In [9]:
import itertools
In [10]:
from pred_file_maker import domain_cols, svm_subcell_cols, getTwoListCombos, svm_detail_cols
In [31]:
for pair in itertools.combinations(proteins,2):
    fvector = getfeaturevector(pair[0],pair[1],subcell_info,odds_dict,proteins,
                               domine_interactions,protein_domain_dict,protein_gene_dict)
    if fvector:
        print fvector
        break
So we can get feature vectors out of this for arbitrary protein pairs.
To return these feature vectors we require a class that integrates the above functions and stores the required data extracted from the files in this folder. This class can then be added to ocbio, as with Gene Ontology, and serve as a custom generator for the feature vectors. The class will be instantiated and pickled as in the Gene Ontology notebook.
The features should also be returned as a list rather than a tab-delimited string as above. Also, it will have to be able to deal with Entrez IDs rather than Ensembl IDs, so it will need to store a one-to-one dictionary for this internally.
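A minimal sketch of roughly what such a class could look like, reusing the module-level getfeaturevector function defined above and the constructor arguments used when ocbio.ents.ENTSfeatures is instantiated below. The ENTSfeaturesSketch name, the split on tabs and the choice of simply taking the first Ensembl ID mapped to each Entrez ID are illustrative assumptions, not the actual ocbio.ents implementation:
class ENTSfeaturesSketch(object):
    # Sketch only: wraps the functions above, maps Entrez IDs to Ensembl IDs
    # internally and returns each feature vector as a list of values.
    def __init__(self, subcell_info, odds_dict, proteins, domine_interactions,
                 protein_domain_dict, protein_gene_dict, entreztoensembl,
                 domain_cols, svm_subcell_cols, svm_detail_cols):
        self.subcell_info = subcell_info
        self.odds_dict = odds_dict
        self.proteins = proteins
        self.domine_interactions = domine_interactions
        self.protein_domain_dict = protein_domain_dict
        self.protein_gene_dict = protein_gene_dict
        self.entreztoensembl = entreztoensembl
        self.domain_cols = domain_cols
        self.svm_subcell_cols = svm_subcell_cols
        self.svm_detail_cols = svm_detail_cols

    def getfeaturevector(self, protein1, protein2):
        # reuse the module-level getfeaturevector defined above
        result = getfeaturevector(protein1, protein2, self.subcell_info,
                                  self.odds_dict, self.proteins,
                                  self.domine_interactions,
                                  self.protein_domain_dict,
                                  self.protein_gene_dict)
        if result is None:
            return None
        domain_string, subcell_string = result
        # return a flat list instead of two tab-delimited strings
        return domain_string.split("\t") + subcell_string.split("\t")

    def __getitem__(self, pair):
        # pair is a frozenset of two Entrez IDs; map each to an Ensembl ID,
        # taking the first mapped ID for simplicity in this sketch
        entrez1, entrez2 = tuple(pair)
        try:
            protein1 = self.entreztoensembl[entrez1][0]
            protein2 = self.entreztoensembl[entrez2][0]
        except KeyError:
            raise KeyError(pair)
        fvector = self.getfeaturevector(protein1, protein2)
        if fvector is None:
            raise KeyError(pair)
        return fvector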
In [12]:
import sys
sys.path.append("/home/gavin/Documents/MRes/opencast-bio/")
In [13]:
import ocbio.ents
The dictionary used here is not one-to-one, and this must be taken into account in the code above. This dictionary is loaded from a pickled dictionary saved in the notebook on extracting InterologWalk features.
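As an illustration of one way to take this into account (not the actual ocbio.ents code), a hypothetical helper could try each candidate Ensembl ID mapped from an Entrez ID until one with domain data is found, assuming a dictionary like the gentreztoensembl dictionary loaded below, which maps each Entrez ID to a list of Ensembl IDs:
def resolve_ensembl_id(entrez_id, entreztoensembl, protein_domain_dict):
    # Hypothetical helper: entreztoensembl maps an Entrez ID to a list of
    # candidate Ensembl IDs; return the first candidate we have domain data for.
    for ensembl_id in entreztoensembl.get(entrez_id, []):
        if ensembl_id in protein_domain_dict:
            return ensembl_id
    return None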
In [14]:
import pickle
In [15]:
import csv
In [18]:
cd ../../geneconversion/
In [19]:
f = open("human.gene2ensemble.pickle")
gentreztoensembl = pickle.load(f)
f.close()
Unfortunately, this dictionary does not cover all of the proteins we would like to be able to map:
In [57]:
len(proteins)
Out[57]:
In [64]:
# flatten is assumed to be available in the session (it is not a builtin)
crossover = 0
for p in proteins:
    if p in flatten(gentreztoensembl.values()):
        crossover += 1
In [65]:
print crossover
In [66]:
print len(proteins)-crossover
So we improve this dictionary with a BioMart CSV, first writing a file of the protein IDs for the web conversion service.
In [69]:
f = open("ents.proteins.ensembl.txt","w")
csv.writer(f,delimiter="\n").writerow(proteins)
f.close()
Having downloaded the converted file biomart.ensembl2entrez.txt, we load it and add its entries to the dictionary:
In [77]:
f = open("biomart.ensembl2entrez.txt")
c = csv.reader(f,delimiter="\t")
c.next()
for line in c:
    try:
        gentreztoensembl[line[1]] += [line[0]]
    except KeyError:
        gentreztoensembl[line[1]] = [line[0]]
f.close()
Has that improved the situation?
In [78]:
crossover = 0
mappedproteins = list(flatten(gentreztoensembl.values()))
for p in proteins:
    if p in mappedproteins:
        crossover += 1
In [79]:
print crossover
In [80]:
print len(proteins)-crossover
In [21]:
cd ../ents/
In [22]:
reload(ocbio.ents)
Out[22]:
In [23]:
entsfeatures = ocbio.ents.ENTSfeatures(subcell_info,odds_dict,proteins,domine_interactions,
                                       protein_domain_dict,protein_gene_dict,gentreztoensembl,
                                       domain_cols,svm_subcell_cols,svm_detail_cols)
In [24]:
for pair in itertools.combinations(proteins,2):
    fvector = entsfeatures.getfeaturevector(pair[0],pair[1])
    if fvector:
        print fvector
        break
And we test calling it with Entrez ID pairs as frozensets:
In [25]:
ensembltoentrez = {}
for k in gentreztoensembl.keys():
    ensembltoentrez[gentreztoensembl[k][0]] = k
entrezlist = []
for p in proteins:
    try:
        entrezlist.append(ensembltoentrez[p])
    except:
        pass
In [26]:
for pair in itertools.combinations(entrezlist,2):
    try:
        fvector = entsfeatures[frozenset(pair)]
    except KeyError:
        continue
    if fvector:
        print fvector
        break
In [27]:
f = open("../DIP/human/training.nolabel.positive.Entrez.txt")
dippairs = list(map(lambda x: frozenset(x),csv.reader(f,delimiter="\t")))
f.close()
In [90]:
highest_domine_conf = []
for pair in dippairs:
    try:
        fvector = entsfeatures[pair]
        highest_domine_conf.append(fvector[7])
    except KeyError as inst:
        continue
In [30]:
print len(fvector)
The feature vectors produced contain some strings that are not explained in the documentation. In the above feature vector, for instance, there is the string 'HC' in the column that should be highest_domine_conf, which is documented as:
The highest confidence score for a pair of domains verified to interact in the DOMINE database.
The string will cause problems when training the classifier, as it cannot take a string as input. Other columns have also been observed to cause problems, as can be seen in older versions of the Classifier Training notebook.
To deal with the highest_domine_conf column we should first look at what the different values are:
In [84]:
print set(highest_domine_conf)
Assuming that those values mean high confidence ('HC'), medium confidence ('MC'), low confidence ('LC') and no DOMINE pair observed ('0'), these are categories and we can use 1-of-k encoding.
The ocbio.ents code will implement this transformation.
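As a minimal sketch of that 1-of-k encoding, assuming the four values observed above; the encode_domine_conf helper below is hypothetical, for illustration only, and is not the actual ocbio.ents code:
def encode_domine_conf(value):
    # Expand a highest_domine_conf value ('HC', 'MC', 'LC' or '0')
    # into four 0/1 indicator features, one per category.
    categories = ['HC', 'MC', 'LC', '0']
    return [1 if value == category else 0 for category in categories]

print encode_domine_conf('HC')   # [1, 0, 0, 0]
print encode_domine_conf('0')    # [0, 0, 0, 1]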
The debugging cells above in this notebook will no longer run as shown after this change.
In [99]:
reload(ocbio.ents)
Out[99]:
In [100]:
entsfeatures = ocbio.ents.ENTSfeatures(subcell_info,odds_dict,proteins,domine_interactions,
                                       protein_domain_dict,protein_gene_dict,gentreztoensembl,
                                       domain_cols,svm_subcell_cols,svm_detail_cols)
In [106]:
highest_domine_conf = []
for pair in dippairs:
    try:
        fvector = entsfeatures[pair]
        highest_domine_conf.append(fvector[7:11])
    except KeyError as inst:
        continue
In [107]:
highest_domine_conf = map(tuple, highest_domine_conf)
print set(highest_domine_conf)
In [116]:
import scipy.misc
In [118]:
total = scipy.misc.comb(len(gentreztoensembl.keys()),2)
In [131]:
stringdict = {}
lcount = 0
for pair in itertools.combinations(gentreztoensembl.keys(),2):
    try:
        fvector = entsfeatures[pair]
    except KeyError:
        pass
    # look in this vector for strings
    try:
        vecfloats = map(float,fvector)
    except:
        # must be a string in there
        # where is it?
        for i,x in enumerate(fvector):
            try:
                float(x)
            except:
                try:
                    stringdict[i] += [x]
                except KeyError:
                    stringdict[i] = [x]
    if lcount%int(total/1000) == 0:
        print lcount
    lcount += 1
The above loop had to be interrupted as it had already been running for over 12 hours. It did, however, test 62 million pairs without finding a single string, as shown below:
In [132]:
print stringdict.keys()
In [134]:
cd ../ents/
In [135]:
!git annex unlock human.ENTS.features.pickle
In [136]:
f = open("human.ENTS.features.pickle","wb")
pickle.dump(entsfeatures,f)
f.close()
In [137]:
f = open("human.ENTS.features.labels.txt","w")
csv.writer(f,delimiter="\n").writerow(domain_cols+svm_subcell_cols+svm_detail_cols)
f.close()